import os
import json
import pandas as pd
from sdv.utils import display_tables
from sdv.evaluation import evaluate
from sdmetrics.reports.multi_table import QualityReport
from sdmetrics.reports import utils
def read_tables(path):
tables = {}
for file in os.listdir(path):
if file.endswith(".csv"):
table_name = file[:-4].split("-")[-1]
table = pd.read_csv(path + file)
tables[table_name] = table
return tables
real_data = read_tables("../data/zurich_source/")
synthetic_data = read_tables("../data/zurich_synthetic/")
# read metadata from ../models/zurich-sdv-metadata.json
metadata_name = '../models/zurich-sdv-1.0-metadata.json'
with open(metadata_name) as metadata_file:
metadata = json.load(metadata_file)
The complete list of possible metrics is:
cstest: This metric compares the distributions of all the
categorical columns of the table by using a Chi-squared test and
returns the average of the p-values obtained across all the
columns. If the tables that you are evaluating do not contain any
categorical columns the result will be nan.kscomplement: This metric compares the distributions of all the
numerical columns of the table with a two-sample Kolmogorov-Smirnov
test using the empirical CDF and returns the average of the
KS statistic values obtained across all the columns. If the tables
that you are evaluating do not contain any numerical columns the result
will be nan.logistic_detection: This metric tries to use a Logistic Regression
classifier to detect whether each row is real or synthetic and then
evaluates its performance using an Area under the ROC curve metric.
The returned score is 1 minus the ROC AUC score obtained by the
classifier.svc_detection: This metric tries to use an Support Vector
Classifier to detect whether each row is real or synthetic and then
evaluates its performance using an Area under the ROC curve metric.
The returned score is 1 minus the ROC AUC score obtained by the
classifier.evaluate(
synthetic_data,
real_data,
aggregate=False,
metadata=metadata)
| metric | name | raw_score | normalized_score | min_value | max_value | goal | error | |
|---|---|---|---|---|---|---|---|---|
| 0 | KSComplement | Inverted Kolmogorov-Smirnov D statistic | 0.547242 | 0.547242 | 0.0 | 1.0 | MAXIMIZE | None |
| 1 | CSTest | Chi-Squared | 0.975976 | 0.975976 | 0.0 | 1.0 | MAXIMIZE | None |
report = QualityReport()
report.generate(real_data, synthetic_data, metadata)
Creating report: 0%| | 0/5 [00:00<?, ?it/s]/Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) /Users/martinjurkovic/.pyenv/versions/3.8.12/envs/ds_project/lib/python3.8/site-packages/sdmetrics/column_pairs/statistical/correlation_similarity.py:60: ConstantInputWarning: One or both of the input arrays is constant. The CorrelationSimilarity metric is either undefined or infinite. warnings.warn(ConstantInputWarning(msg)) Creating report: 100%|██████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 9.51it/s]
Overall Quality Score: 64.32% Properties: Column Shapes: 61.69% Column Pair Trends: 67.55% Parent Child Relationships: 63.71%
# save report to file
# report.save(filepath='../reports/quality_report-zurich-sdmetrics-1.0.pkl')
report.get_details(
property_name='Column Shapes',
table_name='customers')
| Table | Column | Metric | Quality Score | |
|---|---|---|---|---|
| 0 | customers | date_of_birth | KSComplement | 0.798197 |
| 1 | customers | household_id | KSComplement | 0.500711 |
| 2 | customers | customer_type | TVComplement | 0.984283 |
| 3 | customers | gender | TVComplement | 0.876720 |
| 4 | customers | country_part | TVComplement | 0.909256 |
| 5 | customers | household_role | TVComplement | 0.447368 |
fig = report.get_visualization(
property_name='Column Shapes',
table_name='customers')
fig.show()
fig = report.get_visualization(
property_name='Column Pair Trends',
table_name='customers')
fig.show()
fig = report.get_visualization(
property_name='Column Shapes',
table_name='policy')
fig.show()
fig = report.get_visualization(
property_name='Column Pair Trends',
table_name='policy')
fig.show()
fig = report.get_visualization(
property_name='Column Shapes',
table_name='claim')
fig.show()
fig = report.get_visualization(
property_name='Column Pair Trends',
table_name='claim')
fig.show()
fig = report.get_visualization(
property_name='Parent Child Relationships')
fig.show()
# plot for all columns in data
def plot_all_columns(real_data, synthetic_data, metadata):
for table_name, table in real_data.items():
for column_name in table.columns:
try:
fig = utils.get_column_plot(
real_data=real_data[table_name],
synthetic_data=synthetic_data[table_name],
column_name=column_name,
metadata=metadata['tables'][table_name]
)
fig.show()
except Exception as e:
print(f"Could not plot {table_name}.{column_name}")
print(e)
plot_all_columns(real_data, synthetic_data, metadata)
Could not plot customers.customer_id sdtype of type 'id' not recognized.
Could not plot claim.claim_recovered The data appears to lie in a lower-dimensional subspace of the space in which it is expressed. This has resulted in a singular data covariance matrix, which cannot be treated using the algorithms implemented in `gaussian_kde`. Consider performing principle component analysis / dimensionality reduction and using `gaussian_kde` with the transformed data.
Could not plot claim.policy_id sdtype of type 'id' not recognized. Could not plot claim.claim_id sdtype of type 'id' not recognized.
Could not plot policy.policy_id sdtype of type 'id' not recognized.
Could not plot policy.customer_id sdtype of type 'id' not recognized.
real_data['customers'].head()
| customer_id | customer_type | gender | country_part | date_of_birth | household_id | household_role | |
|---|---|---|---|---|---|---|---|
| 0 | 453 | business | NaN | south | NaN | NaN | NaN |
| 1 | 608 | business | NaN | north_east | NaN | NaN | NaN |
| 2 | 980 | business | NaN | north_east | NaN | NaN | NaN |
| 3 | 997 | business | NaN | north_east | NaN | NaN | NaN |
| 4 | 1298 | business | NaN | centre | NaN | NaN | NaN |